{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 多元线性回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[165349.2 136897.8 471784.1 'New York']\n",
      " [162597.7 151377.59 443898.53 'California']\n",
      " [153441.51 101145.55 407934.54 'Florida']\n",
      " [144372.41 118671.85 383199.62 'New York']\n",
      " [142107.34 91391.77 366168.42 'Florida']\n",
      " [131876.9 99814.71 362861.36 'New York']\n",
      " [134615.46 147198.87 127716.82 'California']\n",
      " [130298.13 145530.06 323876.68 'Florida']\n",
      " [120542.52 148718.95 311613.29 'New York']\n",
      " [123334.88 108679.17 304981.62 'California']\n",
      " [101913.08 110594.11 229160.95 'Florida']\n",
      " [100671.96 91790.61 249744.55 'California']\n",
      " [93863.75 127320.38 249839.44 'Florida']\n",
      " [91992.39 135495.07 252664.93 'California']\n",
      " [119943.24 156547.42 256512.92 'Florida']\n",
      " [114523.61 122616.84 261776.23 'New York']\n",
      " [78013.11 121597.55 264346.06 'California']\n",
      " [94657.16 145077.58 282574.31 'New York']\n",
      " [91749.16 114175.79 294919.57 'Florida']\n",
      " [86419.7 153514.11 0.0 'New York']\n",
      " [76253.86 113867.3 298664.47 'California']\n",
      " [78389.47 153773.43 299737.29 'New York']\n",
      " [73994.56 122782.75 303319.26 'Florida']\n",
      " [67532.53 105751.03 304768.73 'Florida']\n",
      " [77044.01 99281.34 140574.81 'New York']\n",
      " [64664.71 139553.16 137962.62 'California']\n",
      " [75328.87 144135.98 134050.07 'Florida']\n",
      " [72107.6 127864.55 353183.81 'New York']\n",
      " [66051.52 182645.56 118148.2 'Florida']\n",
      " [65605.48 153032.06 107138.38 'New York']\n",
      " [61994.48 115641.28 91131.24 'Florida']\n",
      " [61136.38 152701.92 88218.23 'New York']\n",
      " [63408.86 129219.61 46085.25 'California']\n",
      " [55493.95 103057.49 214634.81 'Florida']\n",
      " [46426.07 157693.92 210797.67 'California']\n",
      " [46014.02 85047.44 205517.64 'New York']\n",
      " [28663.76 127056.21 201126.82 'Florida']\n",
      " [44069.95 51283.14 197029.42 'California']\n",
      " [20229.59 65947.93 185265.1 'New York']\n",
      " [38558.51 82982.09 174999.3 'California']\n",
      " [28754.33 118546.05 172795.67 'California']\n",
      " [27892.92 84710.77 164470.71 'Florida']\n",
      " [23640.93 96189.63 148001.11 'California']\n",
      " [15505.73 127382.3 35534.17 'New York']\n",
      " [22177.74 154806.14 28334.72 'California']\n",
      " [1000.23 124153.04 1903.93 'New York']\n",
      " [1315.46 115816.21 297114.46 'Florida']\n",
      " [0.0 135426.92 0.0 'California']\n",
      " [542.05 51743.15 0.0 'New York']\n",
      " [0.0 116983.8 45173.06 'California']]\n",
      "[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51\n",
      " 155752.6  152211.77 149759.96 146121.95 144259.4  141585.52 134307.35\n",
      " 132602.65 129917.04 126992.93 125370.37 124266.9  122776.86 118474.03\n",
      " 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31\n",
      " 103282.38 101004.64  99937.59  97483.56  97427.84  96778.92  96712.8\n",
      "  96479.51  90708.19  89949.14  81229.06  81005.76  78239.91  77798.83\n",
      "  71498.49  69758.98  65200.33  64926.08  49490.75  42559.73  35673.41\n",
      "  14681.4 ]\n"
     ]
    }
   ],
   "source": [
    "dataset  = pd.read_csv('../datasets/50_Startups.csv')\n",
    "X = dataset.iloc[ : , : -1].values\n",
    "Y = dataset.iloc[ : , 4].values\n",
    "print(X)\n",
    "print(Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数字化类别数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.0 0.0 1.0 165349.2 136897.8 471784.1]\n",
      " [1.0 0.0 0.0 162597.7 151377.59 443898.53]\n",
      " [0.0 1.0 0.0 153441.51 101145.55 407934.54]\n",
      " [0.0 0.0 1.0 144372.41 118671.85 383199.62]\n",
      " [0.0 1.0 0.0 142107.34 91391.77 366168.42]\n",
      " [0.0 0.0 1.0 131876.9 99814.71 362861.36]\n",
      " [1.0 0.0 0.0 134615.46 147198.87 127716.82]\n",
      " [0.0 1.0 0.0 130298.13 145530.06 323876.68]\n",
      " [0.0 0.0 1.0 120542.52 148718.95 311613.29]\n",
      " [1.0 0.0 0.0 123334.88 108679.17 304981.62]\n",
      " [0.0 1.0 0.0 101913.08 110594.11 229160.95]\n",
      " [1.0 0.0 0.0 100671.96 91790.61 249744.55]\n",
      " [0.0 1.0 0.0 93863.75 127320.38 249839.44]\n",
      " [1.0 0.0 0.0 91992.39 135495.07 252664.93]\n",
      " [0.0 1.0 0.0 119943.24 156547.42 256512.92]\n",
      " [0.0 0.0 1.0 114523.61 122616.84 261776.23]\n",
      " [1.0 0.0 0.0 78013.11 121597.55 264346.06]\n",
      " [0.0 0.0 1.0 94657.16 145077.58 282574.31]\n",
      " [0.0 1.0 0.0 91749.16 114175.79 294919.57]\n",
      " [0.0 0.0 1.0 86419.7 153514.11 0.0]\n",
      " [1.0 0.0 0.0 76253.86 113867.3 298664.47]\n",
      " [0.0 0.0 1.0 78389.47 153773.43 299737.29]\n",
      " [0.0 1.0 0.0 73994.56 122782.75 303319.26]\n",
      " [0.0 1.0 0.0 67532.53 105751.03 304768.73]\n",
      " [0.0 0.0 1.0 77044.01 99281.34 140574.81]\n",
      " [1.0 0.0 0.0 64664.71 139553.16 137962.62]\n",
      " [0.0 1.0 0.0 75328.87 144135.98 134050.07]\n",
      " [0.0 0.0 1.0 72107.6 127864.55 353183.81]\n",
      " [0.0 1.0 0.0 66051.52 182645.56 118148.2]\n",
      " [0.0 0.0 1.0 65605.48 153032.06 107138.38]\n",
      " [0.0 1.0 0.0 61994.48 115641.28 91131.24]\n",
      " [0.0 0.0 1.0 61136.38 152701.92 88218.23]\n",
      " [1.0 0.0 0.0 63408.86 129219.61 46085.25]\n",
      " [0.0 1.0 0.0 55493.95 103057.49 214634.81]\n",
      " [1.0 0.0 0.0 46426.07 157693.92 210797.67]\n",
      " [0.0 0.0 1.0 46014.02 85047.44 205517.64]\n",
      " [0.0 1.0 0.0 28663.76 127056.21 201126.82]\n",
      " [1.0 0.0 0.0 44069.95 51283.14 197029.42]\n",
      " [0.0 0.0 1.0 20229.59 65947.93 185265.1]\n",
      " [1.0 0.0 0.0 38558.51 82982.09 174999.3]\n",
      " [1.0 0.0 0.0 28754.33 118546.05 172795.67]\n",
      " [0.0 1.0 0.0 27892.92 84710.77 164470.71]\n",
      " [1.0 0.0 0.0 23640.93 96189.63 148001.11]\n",
      " [0.0 0.0 1.0 15505.73 127382.3 35534.17]\n",
      " [1.0 0.0 0.0 22177.74 154806.14 28334.72]\n",
      " [0.0 0.0 1.0 1000.23 124153.04 1903.93]\n",
      " [0.0 1.0 0.0 1315.46 115816.21 297114.46]\n",
      " [1.0 0.0 0.0 0.0 135426.92 0.0]\n",
      " [0.0 0.0 1.0 542.05 51743.15 0.0]\n",
      " [1.0 0.0 0.0 0.0 116983.8 45173.06]]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
    "labelencoder = LabelEncoder()\n",
    "X[ : , 3] = labelencoder.fit_transform(X[ : , 3])\n",
    "onehotencoder = ColumnTransformer([(\"city\",OneHotEncoder(),[3])],remainder = 'passthrough')\n",
    "X = onehotencoder.fit_transform(X)\n",
    "print(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 拆分数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 避免虚拟向量陷阱"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "指当原特征有m个类别时，如果将其转换成m个虚拟变量，就会导致变量间出现完全共线性的情况。\n",
    "假设我们有一个特征“性别”，包含男性和女性两个类别，如果将此特征转换为2个虚拟变量，就是：男x1=[1,0]，女x2=[0,1]，意思就是：变量x1，当性别为男时，x1=1，否则x1=0；变量x2，当性别为女时，x2=1，否则x2=0。这样，目标y=w1x1+w2x2+b。因为x1+x2=1，因此，变量x1和变量x2之间存在线性关系，同时使用这两个变量将会导致共线性问题，使得模型参数无法估计。\n",
    "解决的办法是：把目标y变成y=w1(x1+x2)+(w2-w1)x2+b=(w2-w1)x2+w1+b，意思就是把其中一个变量作为基准（这里是把“男”作为基准），将其从目标方程式中删去，这样只通过一个变量x2就能推导出所有信息，x2=1就表示性别为女，x2=0则表示性别为男。\n",
    "还有一点需要注意的是，基准类别该如何选择？如果基准类别选择不合理，虚拟变量之间仍然会存在共线性的问题。这里直接给出结论：选择占比最大的类别作为基准类别。假设有a，b，c三个类别，如果基准类别a占比太少，那么即使把a去除，b和c之和也会接近于1。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "X1 = X[: , 1:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 拆分数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.0 1.0 0.0 66051.52 182645.56 118148.2]\n",
      " [1.0 0.0 0.0 100671.96 91790.61 249744.55]\n",
      " [0.0 1.0 0.0 101913.08 110594.11 229160.95]\n",
      " [0.0 1.0 0.0 27892.92 84710.77 164470.71]\n",
      " [0.0 1.0 0.0 153441.51 101145.55 407934.54]\n",
      " [0.0 0.0 1.0 72107.6 127864.55 353183.81]\n",
      " [0.0 0.0 1.0 20229.59 65947.93 185265.1]\n",
      " [0.0 0.0 1.0 61136.38 152701.92 88218.23]\n",
      " [0.0 1.0 0.0 73994.56 122782.75 303319.26]\n",
      " [0.0 1.0 0.0 142107.34 91391.77 366168.42]]\n",
      "[103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06\n",
      "  97483.56 110352.25 166187.94]\n",
      "[[1.0 0.0 66051.52 182645.56 118148.2]\n",
      " [0.0 0.0 100671.96 91790.61 249744.55]\n",
      " [1.0 0.0 101913.08 110594.11 229160.95]\n",
      " [1.0 0.0 27892.92 84710.77 164470.71]\n",
      " [1.0 0.0 153441.51 101145.55 407934.54]\n",
      " [0.0 1.0 72107.6 127864.55 353183.81]\n",
      " [0.0 1.0 20229.59 65947.93 185265.1]\n",
      " [0.0 1.0 61136.38 152701.92 88218.23]\n",
      " [1.0 0.0 73994.56 122782.75 303319.26]\n",
      " [1.0 0.0 142107.34 91391.77 366168.42]]\n",
      "[103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06\n",
      "  97483.56 110352.25 166187.94]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)\n",
    "X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y, test_size = 0.2, random_state = 0)\n",
    "print(X_test)\n",
    "print(Y_test)\n",
    "print(X1_test)\n",
    "print(Y1_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 在训练集上训练多元线性回归模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression()"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "regressor = LinearRegression()\n",
    "regressor.fit(X_train,Y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[103015.20159797 132582.27760813 132447.73845173  71976.0985126\n",
      " 178537.4822105  116161.24230166  67851.69209679  98791.73374689\n",
      " 113969.43533012 167921.06569546]\n"
     ]
    }
   ],
   "source": [
    "y_pred = regressor.predict(X_test)\n",
    "print(y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
